Using generators to get numpy chunks out of TAQ data


In [1]:
from glob import glob
import raw_taq
import pandas as pd

In [2]:
import numpy as np
from statistics import mode, StatisticsError  # StatisticsError must be imported to be caught below

def print_stats(chunk):
    """Print summary statistics of the Bid_Price column.

    Parameters
    ----------
    chunk : numpy structured/record array or pandas DataFrame
        Must support string indexing with a 'Bid_Price' field.

    Returns
    -------
    None — results are printed, not returned.
    """
    # Pull the column once instead of re-indexing for every statistic
    prices = chunk['Bid_Price']

    max_price = np.max(prices)
    min_price = np.min(prices)
    avg_price = np.mean(prices)

    # statistics.mode raises StatisticsError on empty input (and, before
    # Python 3.8, when there is no unique mode); fall back to NaN.
    # Bug fix: the original caught StatisticsError without importing it,
    # which would raise NameError on that path.
    try:
        mod_price = mode(prices)
    except StatisticsError:
        mod_price = np.nan

    sd_price = np.std(prices)

    print("Max bid price: ", max_price, "\n", "Min bid price: ", min_price, "\n",
          "Mean bid price: ", avg_price, "\n", "Mode bid price: ", mod_price, "\n",
          "Standard deviation bid price: ", sd_price)

In [7]:
# Re-run this cell after editing raw_taq.py to pick up the changes
import importlib
importlib.reload(raw_taq)


Out[7]:
<module 'raw_taq' from '/Users/dav/Projects/dlab-finance/pynbbo/raw_taq.py'>

Here, we grab whatever BBO file we can find


In [5]:
# Pick the first BBO archive that matches in local_data
bbo_archives = glob('../local_data/EQY_US_ALL_BBO_*.zip')
fname = bbo_archives[0]
test_run = raw_taq.TAQ2Chunks(fname)

In [6]:
# Build a generator that yields the TAQ records 20 rows at a time
chunk_gen = test_run.convert_taq(20)

In [26]:
# convert_taq is a generator function, so calling it returns a generator
type(chunk_gen)


Out[26]:
generator

In [7]:
# You can get one chunk this way — next() advances the generator one step
chunk = next(chunk_gen)
chunk[0]


Out[7]:
(b'P', b'A               ', 0.0, 0, 0.0, 0, b'R', b'P', b'P', 14, b'2', b'2', b' ', b'C', b' ', b' ', b' ', b' ', b' ', b' ', b' ', 1391676960.901)

In [28]:
# If you want just the type — the structured dtype lists every quote field
chunk.dtype


Out[28]:
dtype([('Exchange', 'S1'), ('Symbol', 'S16'), ('Bid_Price', '<f8'), ('Bid_Size', '<i4'), ('Ask_Price', '<f8'), ('Ask_Size', '<i4'), ('Quote_Condition', 'S1'), ('Bid_Exchange', 'S1'), ('Ask_Exchange', 'S1'), ('Sequence_Number', '<i8'), ('National_BBO_Ind', 'S1'), ('NASDAQ_BBO_Ind', 'S1'), ('Quote_Cancel_Correction', 'S1'), ('Source_of_Quote', 'S1'), ('Retail_Interest_Indicator_RPI', 'S1'), ('Short_Sale_Restriction_Indicator', 'S1'), ('LULD_BBO_Indicator_CQS', 'S1'), ('LULD_BBO_Indicator_UTP', 'S1'), ('FINRA_ADF_MPID_Indicator', 'S1'), ('SIP_generated_Message_Identifier', 'S1'), ('National_BBO_LULD_Indicator', 'S1'), ('Time', '<f8')])

In [65]:
# Numpy record arrays support string indexing to get columns (as 1-D arrays)
print(chunk['Bid_Price'])
print(chunk["Ask_Price"])


[  0.     0.     0.    41.9   54.07  57.43  56.07   0.     0.     0.     0.
   0.     0.     0.     0.     0.     0.     0.     0.     0.  ]
[  0.    72.94  60.76  60.76  60.76  60.76  64.    63.29   0.    63.3    0.
  63.3    0.    63.29   0.    63.3    0.    63.3    0.    63.3 ]

In [66]:
# Numeric indexing gives a row (a single record tuple)
chunk[0]


Out[66]:
(b'P', b'A               ', 0.0, 0, 0.0, 0, b'R', b'P', b'P', 14, b'2', b'2', b' ', b'C', b' ', b' ', b' ', b' ', b' ', b' ', b' ', 1391676960.901)

In [31]:
# And you can do both: column first, then row index
chunk['Bid_Price'][6]


Out[31]:
56.07

In [32]:
# Or row first, then field name — same value either way
chunk[6]['Bid_Price']


Out[32]:
56.07

You can also easily convert numpy record arrays to pandas dataframes


In [13]:
# pandas builds a DataFrame from a structured array; columns come from the dtype
chunk_df = pd.DataFrame(chunk)

In [14]:
# Ending the cell with the frame gives the rich HTML display
chunk_df


Out[14]:
Exchange Symbol Bid_Price Bid_Size Ask_Price Ask_Size Quote_Condition Bid_Exchange Ask_Exchange Sequence_Number ... Quote_Cancel_Correction Source_of_Quote Retail_Interest_Indicator_RPI Short_Sale_Restriction_Indicator LULD_BBO_Indicator_CQS LULD_BBO_Indicator_UTP FINRA_ADF_MPID_Indicator SIP_generated_Message_Identifier National_BBO_LULD_Indicator Time
0 b'P' b'A ' 0.00 0 0.00 0 b'R' b'P' b'P' 14 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391677e+09
1 b'P' b'A ' 0.00 0 72.94 27 b'R' b'P' b'P' 76255 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391691e+09
2 b'P' b'A ' 0.00 0 60.76 10 b'R' b'P' b'P' 76256 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391691e+09
3 b'P' b'A ' 41.90 27 60.76 10 b'R' b'P' b'P' 76257 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391691e+09
4 b'P' b'A ' 54.07 27 60.76 10 b'R' b'P' b'P' 76258 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391691e+09
5 b'P' b'A ' 57.43 1 60.76 10 b'R' b'P' b'P' 78938 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391691e+09
6 b'K' b'A ' 56.07 3 64.00 1 b'R' b'K' b'K' 81017 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391691e+09
7 b'T' b'A ' 0.00 0 63.29 1 b'R' b'T' b'T' 81225 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391691e+09
8 b'T' b'A ' 0.00 0 0.00 0 b'R' b'T' b'T' 81598 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391691e+09
9 b'T' b'A ' 0.00 0 63.30 1 b'R' b'T' b'T' 81606 ... b' ' b'C' b' ' b' ' b' ' b' ' b' ' b' ' b' ' 1.391691e+09

10 rows × 22 columns


In [15]:
# Note that time is not correctly parsed yet: it is still a raw float (epoch-like), not a datetime
chunk_df.Time


Out[15]:
0    1.391677e+09
1    1.391691e+09
2    1.391691e+09
3    1.391691e+09
4    1.391691e+09
5    1.391691e+09
6    1.391691e+09
7    1.391691e+09
8    1.391691e+09
9    1.391691e+09
Name: Time, dtype: float64

Goal: Compute some summary statistics across a few securities in the TAQ file

Processing an entire TAQ file will take a long time. So, maybe just run through the chunks for the first two securities (you can then exit out of a loop once you see the third security / symbol).

A complete approach


In [10]:
# Recheck the record layout before writing the per-symbol loop below
chunk.dtype


Out[10]:
dtype([('Exchange', 'S1'), ('Symbol', 'S16'), ('Bid_Price', '<f8'), ('Bid_Size', '<i4'), ('Ask_Price', '<f8'), ('Ask_Size', '<i4'), ('Quote_Condition', 'S1'), ('Bid_Exchange', 'S1'), ('Ask_Exchange', 'S1'), ('Sequence_Number', '<i8'), ('National_BBO_Ind', 'S1'), ('NASDAQ_BBO_Ind', 'S1'), ('Quote_Cancel_Correction', 'S1'), ('Source_of_Quote', 'S1'), ('Retail_Interest_Indicator_RPI', 'S1'), ('Short_Sale_Restriction_Indicator', 'S1'), ('LULD_BBO_Indicator_CQS', 'S1'), ('LULD_BBO_Indicator_UTP', 'S1'), ('FINRA_ADF_MPID_Indicator', 'S1'), ('SIP_generated_Message_Identifier', 'S1'), ('National_BBO_LULD_Indicator', 'S1'), ('Time', '<f8')])

In [8]:
fname = glob('../local_data/EQY_US_ALL_BBO_*.zip')[0]
local_taq = raw_taq.TAQ2Chunks(fname)

chunk_gen = local_taq.convert_taq(20)
first_chunk = next(chunk_gen)
# NOTE(review): the dtype printed earlier names this field 'Symbol', not
# 'Symbol_root' — presumably the reloaded raw_taq renamed it; confirm.
curr_symbol = first_chunk['Symbol_root'][0]

# Accumulate per-symbol records as a list of frames and concat once.
# Bug fix: DataFrame.append is NOT in-place (it returns a new frame, and was
# removed in pandas 2.0), so the original `accum.append(...)` calls silently
# discarded every chunk after the first.
pieces = [pd.DataFrame(first_chunk)]

processed_symbols = 0

for chunk in chunk_gen:
    where_symbol = curr_symbol == chunk['Symbol_root']
    if where_symbol.all():
        # Entire chunk is still the current symbol — keep accumulating
        pieces.append(pd.DataFrame(chunk))
    else:
        # Chunk straddles a symbol boundary: take the tail of the current symbol
        pieces.append(pd.DataFrame(chunk[where_symbol]))
        accum = pd.concat(pieces, ignore_index=True)

        # Compute the stats. Bug fix: the original printed len(curr_symbol),
        # which is the 16-byte width of the symbol field, not the record count.
        print('Current symbol:', curr_symbol, len(accum), 'records')
        print_stats(accum)
        processed_symbols += 1
        if processed_symbols > 3:
            break

        # Start accumulating the next symbol from the rest of this chunk
        diff = chunk[~where_symbol]
        pieces = [pd.DataFrame(diff)]
        curr_symbol = diff['Symbol_root'][0]

In [9]:
# bytes objects compare by value, so raw (space-padded) symbol fields compare correctly
b'AA              ' == b'AA              '


Out[9]:
True

Some simple examples of how generator functions work


In [16]:
def simple_fun(l):
    """Yield each element of iterable ``l`` in order (a minimal generator)."""
    yield from l

In [17]:
# Calling a generator function does not run its body — it returns a generator object
simple_gen = simple_fun(['a', 'b', 1, 2])

In [18]:
# Same type as chunk_gen above: both are generators
type(simple_gen)


Out[18]:
generator

In [19]:
# next() resumes the generator until the next yield, returning the yielded value
next(simple_gen)


Out[19]:
'a'

In [20]:
# Generators plug directly into for loops (and any other iteration context)
for element in simple_fun(['a', 'b', 1, 2]):
    print(element)


a
b
1
2

In [ ]: